
# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************

[-
# Element-type configuration. The type flag is supplied from outside this file:
# 'h' selects the 16-bit settings, any other value the 32-bit settings.
our $type;
my $is_half = $type eq 'h';

# dtype       : type suffix (not used in the visible part of this file)
# convert_in  : conversion opcode for loaded data, '' when none is needed
# convert_out : conversion opcode for results (not used in the visible part)
# dshift      : shift applied to an element offset to form a byte address
# dsize       : element size in bytes
# vsize       : width in bits of one vector load of four elements
our ($dtype, $convert_in, $convert_out, $dshift, $dsize, $vsize) = $is_half
    ? ('U16', 'F2F.F32.F16', 'F2F.F16.F32', '1', '2',  '64')
    : ( '32',            '',            '', '2', '4', '128');

# Accessors for use from inline template expressions.
sub dtype  { $dtype  }
sub dsize  { $dsize  }
sub dshift { $dshift }
sub vsize  { $vsize  }
-]

# Named constants used by the rest of the kernel.
# addr_zero is the location that is written with RZ at kernel entry and read
# back with LDS to clear registers; addr_idx_Y / addr_idx_X / addr_idx_K are
# the slots into which the thread with tid 0 stores idx_Y, idx_X and idx_K.
# The param_ names are consecutive 4-byte slots of constant bank 0 starting at
# offset 0x140 (presumably the kernel arguments in declaration order - confirm
# against the host-side launch code).
<CONSTANT_MAPPING>

    addr_zero   : 4x<32*36*2*4 + 64 + 0>
    addr_idx_Y  : 4x<32*36*2*4 + 64 + 4>
    addr_idx_X  : 4x<32*36*2*4 + 64 + 5>
    addr_idx_K  : 4x<32*36*2*4 + 64 + 6>

    param_S[0]           : c[0x0][0x140]
    param_S[1]           : c[0x0][0x144]
    param_X[0]           : c[0x0][0x148]
    param_X[1]           : c[0x0][0x14c]
    param_O[0]           : c[0x0][0x150]
    param_O[1]           : c[0x0][0x154]
    param_I[0]           : c[0x0][0x158]
    param_I[1]           : c[0x0][0x15c]
    param_F[0]           : c[0x0][0x160]
    param_F[1]           : c[0x0][0x164]
    param_alpha          : c[0x0][0x168]
    param_beta           : c[0x0][0x16c]
    param_flags          : c[0x0][0x170]
    param_C              : c[0x0][0x174]
    param_K              : c[0x0][0x178]
    param_N              : c[0x0][0x17c]
    param_Xk             : c[0x0][0x180]
    param_k              : c[0x0][0x184]
    param_magic_Xk       : c[0x0][0x188]
    param_shift_Xk       : c[0x0][0x18c]
    param_magic_k        : c[0x0][0x190]
    param_shift_k        : c[0x0][0x194]
    param_C_1152         : c[0x0][0x198]
    param_GXS_C_1152     : c[0x0][0x19c]
    param_GYS_GXS_C_1152 : c[0x0][0x1a0]
    param_P              : c[0x0][0x1a4]
    param_Q              : c[0x0][0x1a8]
    param_QN             : c[0x0][0x1ac]
    param_PQN            : c[0x0][0x1b0]
    param_PQN15          : c[0x0][0x1b4]
    param_maskN          : c[0x0][0x1b8]
    param_shiftX         : c[0x0][0x1bc]
    param_shiftY         : c[0x0][0x1c0]
    param_superX         : c[0x0][0x1c4]
    param_superY         : c[0x0][0x1c8]
    param_gridN          : c[0x0][0x1cc]
    param_gridQN         : c[0x0][0x1d0]
    param_gridPQN        : c[0x0][0x1d4]

</CONSTANT_MAPPING>

# Register allocation. Every entry binds one or more register numbers to
# symbolic names. Ranges overlap on purpose: the same physical registers are
# reused under different names by different threads and phases (threads with
# tid >= 128 take the compute path, the others the load path).
# NOTE(review): the exact meaning of the ':', '~' and '=' separators is defined
# by the assembler, not by this file - see the assembler documentation.
<REGISTER_MAPPING>

      // czero aliases registers 0-63 so the accumulators can be cleared with vector loads
       0-63 : czero<00-63>

      // clx: accumulator tile of the load threads, 4 x 8 entries on registers 0-31
     3, 2,11,10 : clx<0-3>y0
     7, 6,15,14 : clx<0-3>y1
     1, 0, 9, 8 : clx<0-3>y2
     5, 4,13,12 : clx<0-3>y3
    19,18,27,26 : clx<0-3>y4
    23,22,31,30 : clx<0-3>y5
    17,16,25,24 : clx<0-3>y6
    21,20,29,28 : clx<0-3>y7

      // jl0 / jl1: the two operand sets of the load threads; jl1Fy0-3 shares 36-39 with jl0Fy0-3
      32-43 : jl0Ix<0-3>, jl0Fy<0-7>
      44-51 : jl1Ix<0-3>, jl1Fy<4-7>
      36-39 : jl1Fy<0-3>

      // T0-T8: staging registers for the nine vector loads; track: two-register global address
      52-87 : T0<0-3>, T1<0-3>, T2<0-3>, T3<0-3>, T4<0-3>, T5<0-3>, T6<0-3>, T7<0-3>, T8<0-3>
      88-89 : track<0-1>
      90-91 ~ writeS

      // temporaries of the load-thread setup code
      32-39 ~ partialC, idx_K, idx_Y, idx_X
      40-86 ~ idx_KYXk, idx_YXk, idx_Xk, idx_k, div<1-3>, magic_YXk, negYXk, magic_Xk, negXk, tid32_2, tid1, tid31, c, offset, idx_N

      32-39 : shuffle16_x<0-3>y0, shuffle16_x<0-3>y1
      48-91 ~ Tid, Tid1, Tid32_2, write16Cs, alpha16


      // ccx: accumulator tile of the compute threads, 8 x 8 entries on registers 0-63
     3, 2,11,10,19,18,27,26 : ccx<0-7>y0
     7, 6,15,14,23,22,31,30 : ccx<0-7>y1
     1, 0, 9, 8,17,16,25,24 : ccx<0-7>y2
     5, 4,13,12,21,20,29,28 : ccx<0-7>y3
    35,34,43,42,51,50,59,58 : ccx<0-7>y4
    39,38,47,46,55,54,63,62 : ccx<0-7>y5
    33,32,41,40,49,48,57,56 : ccx<0-7>y6
    37,36,45,44,53,52,61,60 : ccx<0-7>y7

      // jc0 / jc1: the two operand sets of the compute threads; jc1Ix0-3 shares 64-67 with jc0Ix0-3
      64-79 : jc0Ix<0-7>, jc0Fy<0-7>
      80-91 : jc1Ix<4-7>, jc1Fy<0-7>
      64-67 : jc1Ix<0-3>

      // temporaries of the compute-thread setup code
      64-86 ~ tid16, tid_1, tid128

         87 = tid
      92-95 ~ C, swapBuf, readFs, readIs

      // The names from here on are not referenced in the visible part of this file;
      // presumably they serve the output stage in the included common code - confirm there.
      64-85 ~ tid_128, tid_64, tid_32, tid_31, tid_16, Tid_1, idxN, idxX, idxY, idxK, readFs2, readIs2, p, q, n, z<1-3>, mask_q
      86-95 ~ alpha, one, writeCs, readCs, k, preds, offsetO, bias, bsum_offset

      64-79 : shuffle_x<0-7>y0, shuffle_x<0-7>y1

      // t00 80      r00 78
      // t10 m10     r01 w01
      // t20 m20     r02 w02
      // t30 m30     r03 w03
      // w00 m00     s00 w00
      // w30 m40     s01 w01
      // w10 m10     s02 w02
      // w20 m20     s03 w04

      78 = t0<0-5>, r<0-3>0
      79 = temp

       3, 2,11,10,19,18 : m<0-5>0
       1, 9, 0, 8,17,16 : m<0-5>1
      27,26,25,24,64,65 : m<0-5>2
                2,11,10 : t10, t20, t30
                9, 0, 8 : t11, t21, t31
               26,25,24 : t12, t22, t32
             3, 2,11,19 : w00, w10, w20, w30
             1, 9, 0,17 : w01, w11, w21, w31
            27,26,25,64 : w02, w12, w22, w32

      66,67,68,69,70,71 : m<0-5>3
      72,73,74,75,76,77 : m<0-5>4
       8,24,10,65,16,18 : m<0-5>5
               67,68,69 : t13, t23, t33
               73,74,75 : t14, t24, t34
               24,10,65 : t15, t25, t35
            66,67,68,70 : w03, w13, w23, w33
            72,73,74,76 : w04, w14, w24, w34
             8,24,10,16 : w05, w15, w25, w35

                1,27,66 : r01, r02, r03
                9,26,67 : r11, r12, r13
                0,25,68 : r21, r22, r23
               17,64,70 : r31, r32, r33
             3, 1,27,72 : s00, s01, s02, s03
             2, 9,26,73 : s10, s11, s12, s13
            11, 0,25,74 : s20, s21, s22, s23
            19,17,64,76 : s30, s31, s32, s33

                  80-83 ~ xx<0-3>
                  78-81 ~ sum<0-3>
                  82-83 : Sum<0-1>
                  84-85 : Out<0-1>

             8,10,16,18 ~ b0<0-3>
            24,65,66,67 ~ b1<0-3>
            68,69,70,71 ~ b2<0-3>
            75,77,78,79 ~ b3<0-3>

</REGISTER_MAPPING>

# Kernel entry, executed by every thread: read C and the thread id, set up the
# buffer swap offset, store a zero vector at addr_zero, round C up to an even
# count and send the threads with tid >= 128 to COMPUTE_SETUP. The remaining
# threads fall through to LOAD_SETUP.
--:-:-:-:0      MOV C,   param_C;
--:-:1:-:1      S2R tid, SR_TID.X;
// swapBuf is the byte offset that is added to (and then negated for) the
// shared-memory read / write addresses each time the buffers are swapped
--:-:-:-:1      MOV swapBuf, 4x<32*36*2*2>;
// P0 = tid >= 128 (wait mask 01 waits for the S2R result)
01:-:-:-:0      ISETP.GE.AND P0, PT, tid, 128, PT;
--:-:-:-:1      STS.128 [addr_zero], RZ;
// partialC = C & 1;  C += partialC  (C becomes a multiple of 2)
--:-:-:Y:c      LOP.AND partialC, C, 1;
--:-:-:-:0      IADD C, C, partialC;
--:-:-:-:5  @P0 BRA.U COMPUTE_SETUP;

##############################################################
# Load-thread setup (tid < 128): clear the clx accumulators, split the block
# index into idx_Y / idx_X / idx_K, and derive the shared-memory read and write
# addresses plus the per-thread element offset. Ends by sending the threads
# with tid >= 64 to FILTER_SETUP; the others fall through to IMAGE_SETUP.
LOAD_SETUP:

--:-:1:-:1      S2R idx_YXk, SR_CTAID.X;
--:-:2:-:1      S2R idx_K,   SR_CTAID.Y;

<SCHEDULE_BLOCK>

[+ join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7; +]

// P0 = tid == 0 (the thread that publishes the indices), P1 = tid >= 64
--:-:-:-:1      ISETP.EQ.AND P0, PT, tid, RZ, PT;
--:-:-:-:1      ISETP.GE.AND P1, PT, tid, 64, PT;

// idx_Y   = idx_YXk / blk_Xk
// (multiply by magic_Xk and shift; when magic_Xk is 1 only the shift is applied)
--:-:-:-:1      MOV  magic_Xk, param_magic_Xk;
--:-:-:-:1      IADD negXk, RZ, -param_Xk;
--:-:-:-:1      ISETP.NE.AND P3, PT, magic_Xk, 1, PT;
01:-:-:-:1  @P3 XMAD     div1, idx_YXk,    magic_Xk,    RZ;
--:-:-:-:1  @P3 XMAD     div2, idx_YXk,    magic_Xk.H1, RZ;
--:-:-:-:1  @P3 XMAD     div3, idx_YXk.H1, magic_Xk.H1, RZ;
--:-:-:-:1  @P3 XMAD.CHI div1, idx_YXk.H1, magic_Xk,    div1;
--:-:-:-:1  @P3 IADD3.RS idx_Y, div1, div2, div3;
--:-:-:-:1  @P3 SHR.U32  idx_Y, idx_Y,   param_shift_Xk;
--:-:-:-:1 @!P3 SHR.U32  idx_Y, idx_YXk, param_shift_Xk;

// idx_Xk  = idx_YXk % blk_Xk
--:-:-:-:1      XMAD.LO2 idx_Xk, negXk, idx_Y, idx_YXk;

// idx_X   = idx_Xk / blk_k
// idx_k   = idx_Xk % blk_k
--:-:-:-:1      XMAD    idx_X,  idx_Xk, param_magic_k, RZ;
--:-:-:-:1      SHR.U32 idx_X,  idx_X,  param_shift_k;
--:-:-:-:1      XMAD    idx_k,  idx_X,  param_k, RZ;
--:-:-:-:1      IADD    idx_k, -idx_k,  idx_Xk;

// idx_K = idx_K * blk_k + idx_k
02:-:-:-:1      XMAD idx_K, idx_K, param_k, idx_k;

// thread 0 stores the three indices to their shared-memory slots
--:-:-:-:1  @P0 STS [addr_idx_Y], idx_Y;
--:-:-:-:1  @P0 STS [addr_idx_X], idx_X;
--:-:-:-:1  @P0 STS [addr_idx_K], idx_K;


--:-:-:-:1      LOP.AND  tid32_2,  tid,    -32;
--:-:-:-:1      SHR.U32  tid32_2,  tid32_2, 2;

// readIs = ((tid & -32) >> 2) | ((tid >> 1) & 7)
--:-:-:-:1      BFE.U32 readIs, tid,    0x301; // 3 bits at position 1
--:-:-:-:1      LOP.OR  readIs, readIs, tid32_2;
--:-:-:-:1      SHL     readIs, readIs, 4;

// readFs = ((tid & -32) >> 2) | ((tid & 16) >> 3) | (tid & 1)
--:-:-:-:1      LOP.AND tid1,   tid,    1;
--:-:-:-:1      LOP.AND readFs, tid,    16;
--:-:-:-:1      SHR.U32 readFs, readFs, 3;
--:-:-:-:1      IADD3   readFs, readFs, tid1, tid32_2;
--:-:-:-:1      ISCADD  readFs, readFs, 4x<32*36*2>, 4;

// c = (tid & 63) >> 5
--:-:-:-:1      BFE.U32 c, tid, 0x105; // 1 bit at position 5

// partialC  = (2 - partialC)
// P6        = c < partialC
// partialC *= 32*36 * itemsize
--:-:-:-:1      IADD partialC, -partialC, 2;
--:-:-:-:1      ISETP.LT.AND P6, PT, c, partialC, PT;
--:-:-:-:1      XMAD partialC,  partialC, 1x<32*36 * $dsize>, RZ;

// writeS = (c*32*36 + (tid & 31)*4)*4
--:-:-:-:1      LOP.AND tid31, tid, 31;
--:-:-:-:1      SHL writeS, tid31, 4;
--:-:-:-:1      XMAD writeS, c, 4x<32*36>, writeS;

// offset = c*32*36 + tid31*4
--:-:-:-:1      SHL tid31, tid31, 2;
--:-:-:-:1      XMAD offset, c, 1x<32*36>, tid31;


// P5 = C > 2
--:-:-:-:1      ISETP.GT.AND P5, PT, C, 2, PT;
</SCHEDULE_BLOCK>

--:-:-:-:5  @P1 BRA.U FILTER_SETUP;

##############################################################
# Image threads (tid < 64): add the contributions of idx_X, idx_Y and idx_N
# (block index Z) to the element offset, then form the two-register address
# track from param_I, shifting the offset by the element-size shift.
IMAGE_SETUP:

--:-:1:-:1      S2R idx_N, SR_CTAID.Z;
<SCHEDULE_BLOCK>
// (GN,GYS,GXS,C,6,6,32)
// offset += (idx_N*GYS*GXS*C*32*36 + idx_Y*GXS*C*32*36 + idx_X*C*32*36) * itemsize;
--:-:-:-:1      XMAD.LO2C offset, idx_X, param_C_1152, offset;
--:-:-:-:1      XMAD.LO2C offset, idx_Y, param_GXS_C_1152, offset;
01:-:-:-:1      XMAD.LO2C offset, idx_N, param_GYS_GXS_C_1152, offset;
// track = param_I + (offset << dshift), low word with carry, then high word
--:-:-:-:1      LEA      track0.CC, offset, param_I[0],     [+ dshift() +];
--:-:-:-:0      LEA.HI.X track1,    offset, param_I[1], RZ, [+ dshift() +];
</SCHEDULE_BLOCK>

--:-:-:-:5      BRA.U LOAD;

##############################################################
# Filter threads (64 <= tid < 128): move writeS to the filter part of the
# shared-memory buffer, add the contribution of idx_K to the element offset and
# form the two-register address track from param_F. Falls through to LOAD.
FILTER_SETUP:

<SCHEDULE_BLOCK>
// writeS += 32*36*2*4
--:-:-:-:1      IADD writeS, writeS, 4x<32*36*2>;

// (kBlks,C,6,6,32)
// offset += (idx_K*C*32*36) * itemsize;
--:-:-:-:1      XMAD.LO2C offset, idx_K, param_C_1152, offset;
// track = param_F + (offset << dshift), low word with carry, then high word
--:-:-:-:1      LEA      track0.CC, offset, param_F[0],     [+ dshift() +];
--:-:-:-:2      LEA.HI.X track1,    offset, param_F[1], RZ, [+ dshift() +];
</SCHEDULE_BLOCK>

##############################################################
# Initial load, shared by image and filter threads: fetch nine vectors through
# track when P6 holds, otherwise read zeros from addr_zero; convert them to
# 32-bit floats when a conversion opcode is configured; store the nine vectors
# to shared memory at writeS; then synchronise, point writeS at the other
# buffer, start the first operand reads and, when P5 holds (C > 2), issue the
# next nine global loads before entering LOAD_LOOP.
LOAD:

--:-:-:-:1  @P6 LDG.E.[+ vsize() +] T0, [track + 4x<0*32 * $dsize>];
--:-:-:-:1  @P6 LDG.E.[+ vsize() +] T1, [track + 4x<1*32 * $dsize>];
--:-:2:-:1  @P6 LDG.E.[+ vsize() +] T2, [track + 4x<2*32 * $dsize>];

--:-:-:-:1 @!P6 LDS.U.[+ vsize() +] T0, [addr_zero];
--:-:-:-:1 @!P6 LDS.U.[+ vsize() +] T1, [addr_zero];
--:-:2:-:1 @!P6 LDS.U.[+ vsize() +] T2, [addr_zero];

--:-:-:-:1  @P6 LDG.E.[+ vsize() +] T3, [track + 4x<3*32 * $dsize>];
--:-:-:-:1  @P6 LDG.E.[+ vsize() +] T4, [track + 4x<4*32 * $dsize>];
--:-:3:-:1  @P6 LDG.E.[+ vsize() +] T5, [track + 4x<5*32 * $dsize>];

--:-:-:-:1 @!P6 LDS.U.[+ vsize() +] T3, [addr_zero];
--:-:-:-:1 @!P6 LDS.U.[+ vsize() +] T4, [addr_zero];
--:-:3:-:1 @!P6 LDS.U.[+ vsize() +] T5, [addr_zero];

--:-:-:-:1  @P6 LDG.E.[+ vsize() +] T6, [track + 4x<6*32 * $dsize>];
--:-:-:-:1  @P6 LDG.E.[+ vsize() +] T7, [track + 4x<7*32 * $dsize>];
--:-:4:-:1  @P6 LDG.E.[+ vsize() +] T8, [track + 4x<8*32 * $dsize>];

--:-:-:-:1 @!P6 LDS.U.[+ vsize() +] T6, [addr_zero];
--:-:-:-:1 @!P6 LDS.U.[+ vsize() +] T7, [addr_zero];
--:-:4:-:1 @!P6 LDS.U.[+ vsize() +] T8, [addr_zero];

[+
    # Select the store sequence. With a conversion opcode configured, each
    # loaded vector is first expanded in place with F2F.F32.F16 (highest
    # destination register first) and then stored; otherwise the nine vectors
    # are stored as loaded.
    our $convert_in;
    return $convert_in ? q{

02:-:-:-:1      F2F.F32.F16 T03, T01.H1;
--:-:-:-:1      F2F.F32.F16 T02, T01.H0;
--:-:-:-:1      F2F.F32.F16 T01, T00.H1;
--:-:2:-:1      F2F.F32.F16 T00, T00.H0;

--:-:-:-:1      F2F.F32.F16 T13, T11.H1;
--:-:-:-:1      F2F.F32.F16 T12, T11.H0;
--:-:-:-:1      F2F.F32.F16 T11, T10.H1;
--:-:5:-:1      F2F.F32.F16 T10, T10.H0;

--:-:-:-:1      F2F.F32.F16 T23, T21.H1;
--:-:-:-:1      F2F.F32.F16 T22, T21.H0;
--:-:-:-:1      F2F.F32.F16 T21, T20.H1;
--:-:6:-:1      F2F.F32.F16 T20, T20.H0;

02:-:-:-:1      STS.128 [writeS + 4x<0*32*4>], T0;

04:-:-:-:1      F2F.F32.F16 T33, T31.H1;
--:-:-:-:1      F2F.F32.F16 T32, T31.H0;
--:-:-:-:1      F2F.F32.F16 T31, T30.H1;
--:-:3:-:1      F2F.F32.F16 T30, T30.H0;

10:-:-:-:1      STS.128 [writeS + 4x<1*32*4>], T1;

--:-:-:-:1      F2F.F32.F16 T43, T41.H1;
--:-:-:-:1      F2F.F32.F16 T42, T41.H0;
--:-:-:-:1      F2F.F32.F16 T41, T40.H1;
--:-:5:-:1      F2F.F32.F16 T40, T40.H0;

20:-:-:-:1      STS.128 [writeS + 4x<2*32*4>], T2;

--:-:-:-:1      F2F.F32.F16 T53, T51.H1;
--:-:-:-:1      F2F.F32.F16 T52, T51.H0;
--:-:-:-:1      F2F.F32.F16 T51, T50.H1;
--:-:6:-:1      F2F.F32.F16 T50, T50.H0;

04:-:-:-:1      STS.128 [writeS + 4x<3*32*4>], T3;

08:-:-:-:1      F2F.F32.F16 T63, T61.H1;
--:-:-:-:1      F2F.F32.F16 T62, T61.H0;
--:-:-:-:1      F2F.F32.F16 T61, T60.H1;
--:-:4:-:1      F2F.F32.F16 T60, T60.H0;

10:-:-:-:1      STS.128 [writeS + 4x<4*32*4>], T4;

--:-:-:-:1      F2F.F32.F16 T73, T71.H1;
--:-:-:-:1      F2F.F32.F16 T72, T71.H0;
--:-:-:-:1      F2F.F32.F16 T71, T70.H1;
--:-:5:-:1      F2F.F32.F16 T70, T70.H0;

20:-:-:-:1      STS.128 [writeS + 4x<5*32*4>], T5;

--:-:-:-:1      F2F.F32.F16 T83, T81.H1;
--:-:-:-:1      F2F.F32.F16 T82, T81.H0;
--:-:-:-:1      F2F.F32.F16 T81, T80.H1;
--:-:6:-:1      F2F.F32.F16 T80, T80.H0;

08:-:-:-:1      STS.128 [writeS + 4x<6*32*4>], T6;
10:-:-:-:1      STS.128 [writeS + 4x<7*32*4>], T7;
20:-:-:-:1      STS.128 [writeS + 4x<8*32*4>], T8;

    } : q{
02:-:-:-:1      STS.128 [writeS + 4x<0*32*4>], T0;
--:-:-:-:1      STS.128 [writeS + 4x<1*32*4>], T1;
--:-:-:-:1      STS.128 [writeS + 4x<2*32*4>], T2;
04:-:-:-:1      STS.128 [writeS + 4x<3*32*4>], T3;
--:-:-:-:1      STS.128 [writeS + 4x<4*32*4>], T4;
--:-:-:-:1      STS.128 [writeS + 4x<5*32*4>], T5;
08:-:-:-:1      STS.128 [writeS + 4x<6*32*4>], T6;
--:-:-:-:1      STS.128 [writeS + 4x<7*32*4>], T7;
--:-:-:-:1      STS.128 [writeS + 4x<8*32*4>], T8;
    };
+]

# Advance track by partialC bytes (low word here, carry into the high word
# after the barrier).
--:-:-:-:0      IADD   track0.CC, track0, partialC;

--:-:-:-:5      BAR.SYNC 0;
# Point writeS at the other buffer and flip the sign of swapBuf.
--:-:-:-:1      IADD writeS, writeS, swapBuf;
--:-:-:-:1      IADD swapBuf, RZ, -swapBuf;

--:-:-:-:0      IADD.X track1,    track1, RZ;

--:-:-:-:1      LDS.U.128 jl0Fy0, [readFs + 4x<0*32*36 + 00>];
--:-:-:-:1      LDS.U.128 jl0Ix0, [readIs + 4x<0*32*36 + 00>];
--:-:1:-:1      LDS.U.128 jl0Fy4, [readFs + 4x<0*32*36 + 16>];

--:-:-:-:1  @P5 LDG.E.[+ vsize() +] T0, [track + 4x<0*32 * $dsize>];
--:-:-:-:1  @P5 LDG.E.[+ vsize() +] T1, [track + 4x<1*32 * $dsize>];
--:-:2:-:1  @P5 LDG.E.[+ vsize() +] T2, [track + 4x<2*32 * $dsize>];
--:-:-:-:1  @P5 LDG.E.[+ vsize() +] T3, [track + 4x<3*32 * $dsize>];
--:-:-:-:1  @P5 LDG.E.[+ vsize() +] T4, [track + 4x<4*32 * $dsize>];
--:-:3:-:1  @P5 LDG.E.[+ vsize() +] T5, [track + 4x<5*32 * $dsize>];
--:-:-:-:1  @P5 LDG.E.[+ vsize() +] T6, [track + 4x<6*32 * $dsize>];
--:-:-:-:1  @P5 LDG.E.[+ vsize() +] T7, [track + 4x<7*32 * $dsize>];
--:6:4:-:1  @P5 LDG.E.[+ vsize() +] T8, [track + 4x<8*32 * $dsize>];

--:-:-:-:5      BRA.U LOAD_LOOP;

##############################################################

# Compute-thread setup (tid >= 128): clear all 64 ccx accumulators, derive the
# shared-memory read addresses readIs / readFs from tid - 128, wait at the
# barrier and issue the first four operand loads for the compute loop.
COMPUTE_SETUP:

<SCHEDULE_BLOCK>
[+ join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +]

--:-:-:-:1      IADD tid128, tid, -128;

// In the two formulas below, tid stands for tid128 (tid - 128); both results
// are then scaled by 16 and offset by a constant in the ISCADD instructions.
// readIs = ((tid & -16) >> 1) | ((tid >> 1) & 3)
// readFs = ((tid & -16) >> 1) | ((tid &  8) >> 2) | (tid & 1)
--:-:-:-:1      LOP.AND  tid16,  tid128, -16;
--:-:-:-:1      SHR.U32  tid16,  tid16,   1;

--:-:-:-:1      BFE.U32  readIs, tid128, 0x201; // 2 bits at position 1
--:-:-:-:1      LOP.OR   readIs, readIs, tid16;
--:-:-:-:1      ISCADD   readIs, readIs, 4x<32*4>, 4;

--:-:-:-:1      LOP.AND  tid_1,  tid128, 1;
--:-:-:-:1      LOP.AND  readFs, tid128, 8;
--:-:-:-:1      SHR.U32  readFs, readFs, 2;
--:-:-:-:1      IADD3    readFs, readFs, tid16, tid_1;
--:-:-:-:0      ISCADD   readFs, readFs, 4x<32*4 + 32*36*2>, 4;
</SCHEDULE_BLOCK>

--:-:-:-:5      BAR.SYNC 0;

--:-:-:-:1      LDS.U.128 jc0Ix0, [readIs + 4x<0*32*36 + 00>];
--:-:-:-:1      LDS.U.128 jc0Fy0, [readFs + 4x<0*32*36 + 00>];
--:-:-:-:1      LDS.U.128 jc0Ix4, [readIs + 4x<0*32*36 + 16>];
--:-:1:-:2      LDS.U.128 jc0Fy4, [readFs + 4x<0*32*36 + 16>];

COMPUTE_LOOP:
[+
    # Generates the unrolled body of the compute loop: two passes of 64 FFMAs
    # over the 8 x 8 ccx tile. Pass N multiplies operand set jcN while the
    # other operand set is reloaded from shared memory.

    # Instruction text emitted directly after FFMA number M of pass N,
    # keyed "jNcM".
    my %splice;

    # Loop bookkeeping: P0 = another iteration follows, C -= 2.
    $splice{j0c33} = join '',
        "--:-:-:-:1      ISETP.GT.AND P0, PT, C, 2, PT;\n",
        "--:-:-:-:1      IADD C, C, -2;\n";

    # Barrier, then move both read addresses to the other buffer.
    $splice{j0c62} = join '',
        "02:-:-:Y:5      BAR.SYNC 0;\n",
        "--:-:-:-:1      IADD readFs, readFs, swapBuf;\n",
        "--:-:-:-:1      IADD readIs, readIs, swapBuf;\n",
        "--:-:-:-:1      IADD swapBuf, RZ,   -swapBuf;\n";

    # Loop back while P0 holds, otherwise leave for the finish code.
    $splice{j1c63} = join '',
        "--:-:-:Y:5  \@P0 BRA.U COMPUTE_LOOP;\n",
        "--:-:-:Y:5      BRA.U COMPUTE_FINISH;\n";

    # Visiting order of the 64 accumulators: 2 x 2 base positions, each
    # expanded by the four swirl offsets, with the row order reversed for
    # every other column pair.
    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
    my @rows  = (0,1,4,5);
    my @order;
    foreach my $col (0,2,4,6)
    {
        foreach my $row (@rows)
        {
            push @order, map { [$col + $_->[0], $row + $_->[1]] } @swirl;
        }
        @rows = reverse @rows;
    }

    my @lines;
    foreach my $pass (0, 1)
    {
        my $next  = 1 - $pass;              # operand set being reloaded
        my $pred  = $pass ? '@P0' : '   ';  # pass 1 reloads only if looping again
        my $rdBar = $pass ? '-'   : '2';    # read barrier waited on by BAR.SYNC

        $splice{"j${pass}c0"}  = sprintf "--:-:-:-:1  %s LDS.U.128 jc%dFy4, [readFs + 4x<%d*32*36 + 16>];\n", $pred, $next, $next;
        $splice{"j${pass}c2"}  = sprintf "--:-:-:-:1  %s LDS.U.128 jc%dIx4, [readIs + 4x<%d*32*36 + 16>];\n", $pred, $next, $next;
        $splice{"j${pass}c4"}  = sprintf "--:-:-:-:1  %s LDS.U.128 jc%dFy0, [readFs + 4x<%d*32*36 + 00>];\n", $pred, $next, $next;

        $splice{"j${pass}c31"} = sprintf "--:%s:1:-:1  %s LDS.U.128 jc%dIx0, [readIs + 4x<%d*32*36 + 00>];\n", $rdBar, $pred, $next, $next;

        foreach my $n (0 .. $#order)
        {
            my ($x, $y) = @{ $order[$n] };
            my $extra   = $splice{"j${pass}c$n"} || '';

            # An FFMA directly followed by one of these opcodes gets stall 0.
            my $paired = $extra =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/;
            my $stall  = $paired ? 0 : 1;

            # Yield on every tenth FFMA, but only when it carries a stall.
            my $yield  = ($stall && $n % 10 == 0) ? 'Y' : '-';

            # The first FFMA of each pass waits on barrier 1 (operand loads).
            my $wait   = $n == 0 ? '01' : '--';

            push @lines,
                sprintf("%s:-:-:%s:%d      FFMA ccx%dy%d, jc%dIx%d, jc%dFy%d, ccx%dy%d;\n",
                        $wait, $yield, $stall,  $x,$y,  $pass,$x,  $pass,$y,  $x,$y),
                $extra;
        }
    }
    return join '', @lines;
+]

# Main loop of the load threads. Each iteration advances track, sets the loop
# predicates (P0: another iteration follows, P1: one more round of global loads
# is needed) and decrements C by 2, then runs the generated body below.
LOAD_LOOP:
--:-:-:-:1      ISETP.GT.AND P0, PT, C, 2, PT;
20:-:-:-:1      IADD track0.CC, track0, 1x<32*36*2 * $dsize>;
--:-:-:-:1      ISETP.GT.AND P1, PT, C, 4, PT;
--:-:-:-:1      IADD C, C, -2;
[+
    # Generates the unrolled loop body: two passes (j = 0, 1) of 32 FFMAs over
    # the clx tile, with all memory traffic and bookkeeping spliced in after
    # selected FFMAs. Keys of the insert table are "jNcM": the text is emitted
    # right after FFMA number M of pass N.
    #
    # Every store of freshly loaded data is predicated on P0. When P0 is false
    # no global load was issued for this iteration (the load predicate of the
    # previous iteration, P1, or P5 before the loop, tests the same condition),
    # so T0-T8 would only hold stale data.
    our ($vsize, $dsize, $convert_in);
    my %insert = (

        # Carry of the track update into the high word.
        j0c3 => "--:-:-:-:1      IADD.X track1, track1, RZ;\n",

        # Operand reads for pass 1 (second half of the current buffer).
        j0c0  => "--:-:-:-:1      LDS.U.128 jl1Fy4, [readFs + 4x<1*32*36 + 16>];\n",
        j0c2  => "--:-:-:-:1      LDS.U.128 jl1Ix0, [readIs + 4x<1*32*36 + 00>];\n",
        j0c18 => "--:-:1:-:1      LDS.U.128 jl1Fy0, [readFs + 4x<1*32*36 + 00>];\n",

        # Operand reads for pass 0 of the next iteration, after the barrier.
        j1c12 => "--:-:-:-:1  \@P0 LDS.U.128 jl0Fy4, [readFs + 4x<0*32*36 + 16>];\n",
        j1c14 => "--:-:-:-:1  \@P0 LDS.U.128 jl0Ix0, [readIs + 4x<0*32*36 + 00>];\n",
        j1c16 => "--:-:1:-:1  \@P0 LDS.U.128 jl0Fy0, [readFs + 4x<0*32*36 + 00>];\n",

        $convert_in ? (

            # Converting variant: expand each vector in place, store it under
            # P0, then reload its registers under P1.
            j0c1  => "02:-:-:-:1      F2F.F32.F16 T03, T01.H1;\n" .
                     "--:-:-:-:1      F2F.F32.F16 T02, T01.H0;\n",
            j0c4  => "--:-:-:-:1      F2F.F32.F16 T01, T00.H1;\n" .
                     "--:-:2:-:1      F2F.F32.F16 T00, T00.H0;\n",

            j0c5  => "--:-:-:-:1      F2F.F32.F16 T13, T11.H1;\n" .
                     "--:-:-:-:1      F2F.F32.F16 T12, T11.H0;\n",
            j0c6  => "--:-:-:-:1      F2F.F32.F16 T11, T10.H1;\n" .
                     "--:-:5:-:1      F2F.F32.F16 T10, T10.H0;\n",

            j0c7  => "--:-:-:-:1      F2F.F32.F16 T23, T21.H1;\n" .
                     "--:-:-:-:1      F2F.F32.F16 T22, T21.H0;\n",
            j0c8  => "--:-:-:-:1      F2F.F32.F16 T21, T20.H1;\n" .
                     "--:-:6:-:1      F2F.F32.F16 T20, T20.H0;\n",

            j0c9  => "02:2:-:-:1  \@P0 STS.128 [writeS + 4x<0*32*4>], T0;\n",
            j0c10 => "10:5:-:-:1  \@P0 STS.128 [writeS + 4x<1*32*4>], T1;\n",
            j0c11 => "20:6:-:-:1  \@P0 STS.128 [writeS + 4x<2*32*4>], T2;\n",

            j0c13 => "02:-:-:-:1  \@P1 LDG.E.$vsize T0, [track + 4x<0*32 * $dsize>];\n",
            j0c14 => "10:-:-:-:1  \@P1 LDG.E.$vsize T1, [track + 4x<1*32 * $dsize>];\n",
            j0c15 => "20:-:2:-:1  \@P1 LDG.E.$vsize T2, [track + 4x<2*32 * $dsize>];\n",

            j0c16 => "04:-:-:-:1      F2F.F32.F16 T33, T31.H1;\n" .
                     "--:-:-:-:1      F2F.F32.F16 T32, T31.H0;\n",
            j0c17 => "--:-:-:-:1      F2F.F32.F16 T31, T30.H1;\n" .
                     "--:-:3:-:1      F2F.F32.F16 T30, T30.H0;\n",

            j0c19 => "--:-:-:-:1      F2F.F32.F16 T43, T41.H1;\n" .
                     "--:-:-:-:1      F2F.F32.F16 T42, T41.H0;\n",
            j0c20 => "--:-:-:-:1      F2F.F32.F16 T41, T40.H1;\n" .
                     "--:-:5:-:1      F2F.F32.F16 T40, T40.H0;\n",

            j0c21 => "--:-:-:-:1      F2F.F32.F16 T53, T51.H1;\n" .
                     "--:-:-:-:1      F2F.F32.F16 T52, T51.H0;\n",
            j0c22 => "--:-:-:-:1      F2F.F32.F16 T51, T50.H1;\n" .
                     "--:-:6:-:1      F2F.F32.F16 T50, T50.H0;\n",

            j0c23 => "04:3:-:-:1  \@P0 STS.128 [writeS + 4x<3*32*4>], T3;\n",
            j0c24 => "10:5:-:-:1  \@P0 STS.128 [writeS + 4x<4*32*4>], T4;\n",
            j0c25 => "20:6:-:-:1  \@P0 STS.128 [writeS + 4x<5*32*4>], T5;\n",

            j0c27 => "04:-:-:-:1  \@P1 LDG.E.$vsize T3, [track + 4x<3*32 * $dsize>];\n",
            j0c28 => "10:-:-:-:1  \@P1 LDG.E.$vsize T4, [track + 4x<4*32 * $dsize>];\n",
            j0c29 => "20:-:3:-:1  \@P1 LDG.E.$vsize T5, [track + 4x<5*32 * $dsize>];\n",

            j0c30 => "08:-:-:-:1      F2F.F32.F16 T63, T61.H1;\n" .
                     "--:-:-:-:1      F2F.F32.F16 T62, T61.H0;\n",
            j0c31 => "--:-:-:-:1      F2F.F32.F16 T61, T60.H1;\n" .
                     "--:-:4:-:1      F2F.F32.F16 T60, T60.H0;\n",

            j1c0  => "--:-:-:-:1      F2F.F32.F16 T73, T71.H1;\n" .
                     "--:-:-:-:1      F2F.F32.F16 T72, T71.H0;\n",
            j1c1  => "--:-:-:-:1      F2F.F32.F16 T71, T70.H1;\n" .
                     "--:-:5:-:1      F2F.F32.F16 T70, T70.H0;\n",

            j1c2  => "--:-:-:-:1      F2F.F32.F16 T83, T81.H1;\n" .
                     "--:-:-:-:1      F2F.F32.F16 T82, T81.H0;\n",
            j1c3  => "--:-:-:-:1      F2F.F32.F16 T81, T80.H1;\n" .
                     "--:-:6:-:1      F2F.F32.F16 T80, T80.H0;\n",

            j1c4  => "08:4:-:-:1  \@P0 STS.128 [writeS + 4x<6*32*4>], T6;\n",
            j1c5  => "10:5:-:-:1  \@P0 STS.128 [writeS + 4x<7*32*4>], T7;\n",
            j1c6  => "20:6:-:-:1  \@P0 STS.128 [writeS + 4x<8*32*4>], T8;\n",

            j1c8  => "08:-:-:-:1  \@P1 LDG.E.$vsize T6, [track + 4x<6*32 * $dsize>];\n",
            j1c9  => "10:-:-:-:1  \@P1 LDG.E.$vsize T7, [track + 4x<7*32 * $dsize>];\n",
            j1c10 => "20:6:4:-:1  \@P1 LDG.E.$vsize T8, [track + 4x<8*32 * $dsize>];\n",

        ) : (

            # Non-converting variant: store each group of three vectors under
            # P0, then reload the same registers under P1.
            j0c6  => "02:-:-:-:1  \@P0 STS.128 [writeS + 4x<0*32*4>], T0;\n",
            j0c8  => "--:-:-:-:1  \@P0 STS.128 [writeS + 4x<1*32*4>], T1;\n",
            j0c10 => "--:2:-:-:1  \@P0 STS.128 [writeS + 4x<2*32*4>], T2;\n",

            j0c12 => "02:-:-:-:1  \@P1 LDG.E.$vsize T0, [track + 4x<0*32 * $dsize>];\n",
            j0c14 => "--:-:-:-:1  \@P1 LDG.E.$vsize T1, [track + 4x<1*32 * $dsize>];\n",
            j0c16 => "--:-:2:-:1  \@P1 LDG.E.$vsize T2, [track + 4x<2*32 * $dsize>];\n",

            j0c20 => "04:-:-:-:1  \@P0 STS.128 [writeS + 4x<3*32*4>], T3;\n",
            j0c22 => "--:-:-:-:1  \@P0 STS.128 [writeS + 4x<4*32*4>], T4;\n",
            j0c24 => "--:3:-:-:1  \@P0 STS.128 [writeS + 4x<5*32*4>], T5;\n",

            j0c26 => "04:-:-:-:1  \@P1 LDG.E.$vsize T3, [track + 4x<3*32 * $dsize>];\n",
            j0c28 => "--:-:-:-:1  \@P1 LDG.E.$vsize T4, [track + 4x<4*32 * $dsize>];\n",
            j0c30 => "--:-:3:-:1  \@P1 LDG.E.$vsize T5, [track + 4x<5*32 * $dsize>];\n",

            j1c0  => "08:-:-:-:1  \@P0 STS.128 [writeS + 4x<6*32*4>], T6;\n",
            j1c2  => "--:-:-:-:1  \@P0 STS.128 [writeS + 4x<7*32*4>], T7;\n",
            j1c4  => "--:4:-:-:1  \@P0 STS.128 [writeS + 4x<8*32*4>], T8;\n",

            j1c6  => "08:-:-:-:1  \@P1 LDG.E.$vsize T6, [track + 4x<6*32 * $dsize>];\n",
            j1c8  => "--:-:-:-:1  \@P1 LDG.E.$vsize T7, [track + 4x<7*32 * $dsize>];\n",
            j1c10 => "--:6:4:-:1  \@P1 LDG.E.$vsize T8, [track + 4x<8*32 * $dsize>];\n",
        ),

        # Barrier, then (only when looping again) swap the read and write
        # buffers and flip the sign of swapBuf.
        j1c11 => "--:-:-:Y:5      BAR.SYNC 0;\n" .
                 "--:-:-:-:1  \@P0 IADD readFs, readFs, -swapBuf;\n" .
                 "--:-:-:-:1  \@P0 IADD readIs, readIs, -swapBuf;\n" .
                 "--:-:-:-:1  \@P0 IADD writeS, writeS,  swapBuf;\n" .
                 "--:-:-:-:1  \@P0 IADD swapBuf, RZ,    -swapBuf;\n",

        # Loop back while P0 holds; otherwise fall through to whatever follows
        # the generated body.
        j1c31 => "--:-:-:Y:5  \@P0 BRA.U LOAD_LOOP;\n",
    );

    # Visiting order of the 32 accumulators: eight base positions, each
    # expanded by the four swirl offsets.
    my @cOrder;
    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
    foreach my $xy ([0,0],[0,1],[2,1],[2,0],[2,4],[2,5],[0,5],[0,4])
    {
        my ($x, $y) = @$xy;
        push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
    }
    my $out;
    foreach my $j (0 .. 1)
    {
        foreach my $c (0 .. 31)
        {
            my ($x,$y) = @{$cOrder[$c]};

            my $ins    = $insert{"j${j}c$c"} || '';

            # The first FFMA of each pass waits on barrier 1 (operand reads).
            my $wait   = $c == 0 ? "01" : '--';

            # An FFMA directly followed by one of these opcodes gets stall 0.
            my $stall  = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1;

            my $ctrl   = "$wait:-:-:-:$stall";

            $out .= sprintf "%s      FFMA clx%dy%d, jl%dIx%d, jl%dFy%d, clx%dy%d;\n%s", $ctrl,  $x,$y,  $j,$x,  $j,$y,  $x,$y,  $ins;
        }
    }
    return $out;
+]

[-
    # Three numeric constants exported as package variables. They are not
    # referenced in the visible part of this file; presumably they are consumed
    # by the common file included right below - confirm there.
    our ($trans1, $trans2, $trans3) = ("0.343", "0.700", "0.490");
-]

<INCLUDE file="xconv_winograd_4x4_3x3_32x32_common.sass"/>
